Chargement des packages R

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Analyse d’ADMIXTURE - erreurs de CV

Admixture non supervisée

SeqApiPop - 629 échantillons - MAF > 0.01

LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)

 # LD03 - CV Error plot
setwd("~/Documents/Stage_NB/data/maf001_LD03")

cv_error <- read.table("SeqApiPop_629_maf001_LD03_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD03_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]

# cv error boxplot
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500, 0.505),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.475, 0.505))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# jitter plot
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500, 0.505),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.475, 0.505))

# Filtrer les données pour inclure uniquement K = 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
filtered_data <- merge_cv_error[merge_cv_error[,1] %in% c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.475, 0.5))

LD pruning = 0.2 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD02
setwd("~/Documents/Stage_NB/data/maf001_LD02")

cv_error <- read.table("SeqApiPop_629_maf001_LD02_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD02_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]

# boxplot LD02
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.430, 0.435, 0.440, 0.445, 0.450),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.430, 0.450))

# jitter plot LD02
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.430, 0.435, 0.440, 0.445, 0.450),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.430, 0.450))

LD pruning = 0.1 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD01
setwd("~/Documents/Stage_NB/data/maf001_LD01")

cv_error <- read.table("SeqApiPop_629_maf001_LD01_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD01_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]

# boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.370, 0.375, 0.380, 0.385, 0.390),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.370, 0.390))

# jitter plot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.370, 0.375, 0.380, 0.385, 0.390),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.370, 0.390))

LD pruning = 0.05 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD005
setwd("~/Documents/Stage_NB/data/maf001_LD005")

cv_error <- read.table("SeqApiPop_629_maf001_LD005_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD005_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

# boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.460, 0.465, 0.470, 0.475, 0.480),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.460, 0.480))

# jitter boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.460, 0.465, 0.470, 0.475, 0.480),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.460, 0.480))

LD pruning = 0.04 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD004
setwd("~/Documents/Stage_NB/data/maf001_LD004")

cv_error <- read.table("SeqApiPop_629_maf001_LD004_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD004_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

# CV error boxplot LD004
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.575, 0.58, 0.585, 0.59, 0.595, 0.6),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.575, 0.6))

# jitter plot LD004
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.575, 0.58, 0.585, 0.59, 0.595, 0.6),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.575, 0.6))

LD pruning = 0.03 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD003
setwd("~/Documents/Stage_NB/data/maf001_LD003")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD003_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

# CV error boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.78, 0.815))

# jitter boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.78, 0.815))

LD pruning = 0.01 (fenêtre de 1749 SNPs et pas de 175 bp)

##### LD001
setwd("~/Documents/Stage_NB/data/maf001_LD001")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:10) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_maf001_LD001_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

# CV error boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.895, 0.9, 0.905, 0.91, 0.915, 0.92, 0.925, 0.93, 0.935, 0.94, 0.945),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.895, 0.945))

# jitter boxplot LD01
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.895, 0.9, 0.905, 0.91, 0.915, 0.92, 0.925, 0.93, 0.935, 0.94, 0.945),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.895, 0.945))

SeqApiPop - 561 échantillons - MAF > 0.01

LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)

# LD03 - CV Error plot
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_maf001_LD03")

cv_error <- read.table("SeqApiPop_561_maf001_LD03_1.cv.error", header = F)

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_561_maf001_LD03_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer  le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]

# CV error boxplot
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.46, 0.465, 0.47,0.475, 0.480, 0.485, 0.490),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.46, 0.49))

# jitter boxplot
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.46, 0.465, 0.47,0.475, 0.480, 0.485, 0.490),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.46, 0.49))

# Filtrer les données pour inclure uniquement les valeurs de K spécifiées
filtered_data <- merge_cv_error[merge_cv_error[,1] %in% c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.46, 0.465, 0.47, 0.475, 0.480, 0.485),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.4625, 0.485))

SeqApiPop - 629 échantillons - SNPsBeeMuSe filtered

No LD pruning - 10030 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_SNPsBeeMuSe_filtered_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure - data - frame
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.72,0.725,0.73,0.735,0.74,0.745,0.75,0.755,0.76,0.765,0.77,0.775,0.78,0.785,0.79,0.795,0.8),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.72, 0.8))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.72,0.725,0.73,0.735,0.74,0.745,0.75,0.755,0.76,0.765,0.77,0.775,0.78,0.785,0.79,0.795,0.8),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.72, 0.8))

# Filtrer les données pour inclure uniquement les valeurs de K de 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
filtered_data <- merge_cv_error[merge_cv_error[,1] %in%  c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.72, 0.725, 0.73, 0.735, 0.74, 0.745),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.72, 0.745))

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp) - 3848 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,0.82,0.825,0.83,0.835,0.84),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.765, 0.84))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,0.82,0.825,0.83,0.835,0.84),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.765, 0.84))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_default_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,0.82),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.755, 0.82))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,0.82),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.755, 0.82))

# Filtrer les données pour inclure uniquement les valeurs de K de 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
filtered_data <- merge_cv_error[merge_cv_error[,1] %in% c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.755, 0.785))

SeqApiPop - 561 échantillons - SNPsBeeMuSe filtered

No LD pruning - 10030 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_561_SNPsBeeMuSe_filtered_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure - data - frame
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.755,0.76,0.765,0.77,0.775,0.78,0.785,0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83,0.835,0.84),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.755, 0.84))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.755,0.76,0.765,0.77,0.775,0.78,0.785,0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83,0.835,0.84),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.755, 0.84))

# Filtrer les données pour inclure uniquement les valeurs de K de 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
filtered_data <- merge_cv_error[merge_cv_error[,1] %in%  c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12), ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.72, 0.725, 0.73, 0.735, 0.74, 0.745,0.75,0.755,0.76,0.765,0.77,0.775),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.758, 0.775))

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPS et pas de 175 bp) - 3848 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuse_LD03")

liste_de_donnees <- list()

for (i in 1:30) {
  merge_cv_error <- paste0('SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_pruned_', i, '.cv.error')
  donnees <- read.table(merge_cv_error, header = FALSE)
  liste_de_donnees[[i]] <- donnees
}

donnees_combinees <- do.call(rbind, liste_de_donnees)
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.775, 0.78, 0.785, 0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83,0.835,0.84,0.845,0.85),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.775, 0.85))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.775, 0.78, 0.785, 0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83,0.835,0.84,0.845,0.85),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.775, 0.85))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPS et pas de 10 bp) - 1055 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe_LD_default")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.765,0.77, 0.775, 0.78, 0.785, 0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.765, 0.83))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.765,0.77, 0.775, 0.78, 0.785, 0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.765, 0.83))

# Valeurs de K à inclure dans le graphique
k_values <- c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12)

# Filtrer les données pour inclure uniquement les valeurs de K spécifiées
filtered_data <- merge_cv_error[merge_cv_error[,1] %in% k_values, ]

# Créer le jitter plot avec les données filtrées
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) + 
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.767, 0.79))

BeeMuSe - 12000 SNPs

K_new <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70)
CV_new <- c(0.48943, 0.46307, 0.44338, 0.42781, 0.41962, 0.40411, 0.40055, 0.38682, 0.38044, 0.37802, 0.37102, 0.36619, 0.36130, 0.35863, 0.35431, 0.35162, 0.34960, 0.34742, 0.34640, 0.34555, 0.34281, 0.34187, 0.34055, 0.34081, 0.33861, 0.33930, 0.33562, 0.33588, 0.33543, 0.33518, 0.33262, 0.33176, 0.33216, 0.33019, 0.33013, 0.32908, 0.33166, 0.32875, 0.32961, 0.32977, 0.32796, 0.32878, 0.32828, 0.32772, 0.32533, 0.32954, 0.32862, 0.32522, 0.32771, 0.32770, 0.32558, 0.33150, 0.33390, 0.32462, 0.32938, 0.32893, 0.33254, 0.32727, 0.32883, 0.33015, 0.33254, 0.33260, 0.33633, 0.33744, 0.33756, 0.33277, 0.33579)

# Trouver l'indice de la valeur la plus basse de CV
indice_min_new <- which.min(CV_new)

# Créer le graphique
plot(K_new, CV_new, type="l", col="black", xlab="K", ylab="CV", main="CV error plot - BeeMuSe  3848 SNPs")

# Ajouter la ligne avec le trait hachuré bleu pour la valeur la plus basse
abline(h=CV_new[indice_min_new], col="blue", lty=2)

# Ajouter la droite verticale rouge pour la valeur la plus basse de CV
abline(v=K_new[indice_min_new], col="red", lty=2)

# Ajouter les lignes de grille horizontales à intervalles de 0.01
abline(h=seq(0, 1, by=0.01), col="lightgray")

# Ajouter la légende avec la valeur de K correspondant à la plus basse erreur CV
legend("topright", legend=sprintf("lowest CV error: %.5f (K = %d)", CV_new[indice_min_new], K_new[indice_min_new]), col="blue", lty=2, cex=0.8)

Merged Data - BeeMuSe SeqApiPop

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPS et pas de 175 bp)

# Valeurs CV - Admixture non supervisée
K <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48 ,49, 50)
CV <- c(0.66344, 0.62032, 0.60790, 0.60015, 0.58943, 0.58458, 0.58509, 0.57812, 0.57116, 0.56933, 0.56023, 0.56145, 0.55645, 0.55416, 0.55222, 0.54907, 0.54576, 0.54370, 0.54270, 0.54220, 0.54162, 0.54375, 0.54147, 0.53958, 0.54064, 0.53829, 0.54184, 0.53940, 0.53965, 0.54062, 0.53549, 0.54011, 0.53806, 0.53622, 0.53630, 0.53986, 0.54062, 0.53707, 0.54252, 0.53593, 0.53901, 0.54513, 0.54334, 0.54278, 0.54128, 0.54482, 0.54184, 0.55066, 0.55151
)

# Trouver l'indice de la valeur la plus basse de CV
indice_min <- which.min(CV)

# Créer le graphique
plot(K, CV, type="l", col="black", xlab="K", ylab="CV", main="CV error plot - Merged BeeMuSe SeqApiPop 3848 SNPs")

# Ajouter la ligne avec le trait hachuré bleu pour la valeur la plus basse
abline(h=CV[indice_min], col="blue", lty=2)

# Ajouter la droite verticale rouge pour la valeur la plus basse de CV
abline(v=K[indice_min], col="red", lty=2)

# Ajouter les lignes de grille horizontales à intervalles de 0.01
abline(h=seq(0, 1, by=0.01), col="lightgray")

# Ajouter la légende
legend("topright", legend=sprintf("lowest CV error: %.5f (K = %d)", CV[indice_min], K[indice_min]), col="blue", lty=2, cex=0.8)

# Nouvelles données
K <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50)
CV <- c(0.66350, 0.62020, 0.60778, 0.59670, 0.58968, 0.58467, 0.58258, 0.57804, 0.57563, 0.56626, 0.56028, 0.55808, 0.55537, 0.55140, 0.55279, 0.54843, 0.54649, 0.54583, 0.54412, 0.54363, 0.54187, 0.54172, 0.54028, 0.53822, 0.54265, 0.53707, 0.53598, 0.53746, 0.54359, 0.53759, 0.53651, 0.53563, 0.53614, 0.53701, 0.53993, 0.53787, 0.53929, 0.53656, 0.53982, 0.54181, 0.54237, 0.53793, 0.54851, 0.54505, 0.54506, 0.54203, 0.54622, 0.55210, 0.55177)

# Trouver l'indice de la valeur la plus basse de CV
indice_min <- which.min(CV)

# Créer le graphique
plot(K, CV, type="l", col="black", xlab="K", ylab="CV", main="CV error plot - Merged BeeMuSe SeqApiPop 3848 SNPs")

# Ajouter la ligne avec le trait hachuré bleu pour la valeur la plus basse
abline(h=CV[indice_min], col="blue", lty=2)

# Ajouter la droite verticale rouge pour la valeur la plus basse de CV
abline(v=K[indice_min], col="red", lty=2)

# Ajouter les lignes de grille horizontales à intervalles de 0.01
abline(h=seq(0, 1, by=0.01), col="lightgray")

# Ajouter la légende
legend("topright", legend=sprintf("lowest CV error: %.5f (K = %d)", CV[indice_min], K[indice_min]), col="blue", lty=2, cex=0.8)

629 échantillons - K2 à K9 - 30 exécutions

setwd("~/Documents/Stage_NB/data/merged_BeeMuSe_SeqApiPop_629_filtered_maf001_LD03")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_629_filtered_maf001_LD03_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.55, 0.67))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.55, 0.67))

561 échantillons - K2 à K9 - 30 exécutions

setwd("~/Documents/Stage_NB/data/merged_data_3848_561_not_supervised")

liste_de_donnees <- list()
for (i in 1:30) {
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD03_', i, '.cv.error')
  donnees <- read.table(merge_cv_error, header = FALSE)
  liste_de_donnees[[i]] <- donnees
}

donnees_combinees <- do.call(rbind, liste_de_donnees)
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.54,0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,0.66,0.67),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.55, 0.67))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.54,0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,0.66,0.67),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.55, 0.67))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPS et pas de 10 bp)

561 échantillons - K2 à K9 - 30 exécutions

setwd("~/Documents/Stage_NB/data/merged_data_1055_561_not_supervised")

liste_de_donnees <- list()
for (i in 1:30) {
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_', i, '.cv.error')
  donnees <- read.table(merge_cv_error, header = FALSE)
  liste_de_donnees[[i]] <- donnees
}

donnees_combinees <- do.call(rbind, liste_de_donnees)
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.54,0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.54, 0.65))

#jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.54,0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.54, 0.65))

Admixture supervisée

Merged Data - BeeMuSe SeqApiPop

629 échantillons - LD pruning = 0.1 (fenêtre de 50 et pas de 10 bp) - 1055 SNPs

K = 5
setwd("~/Documents/Stage_NB/data/merged_data_3848_629_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_629_filtered_MAF001_LD_default_K5_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.616,0.617,0.618,0.619,0.62,0.621,0.622),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.616, 0.622))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.616,0.617,0.618,0.619,0.62,0.621,0.622),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.616, 0.622))

K = 6
setwd("~/Documents/Stage_NB/data/merged_data_3848_629_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_629_filtered_MAF001_LD_default_K6_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.611,0.612,0.613,0.614,0.615,0.616,0.617),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.611, 0.617))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.611,0.612,0.613,0.614,0.615,0.616,0.617),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.611, 0.617))

561 échantillons - LD pruning = 0.3 (fenêtre de 1749 et pas de 175 bp) K = 5 - 3848 SNPs

setwd("~/Documents/Stage_NB/data/merged_data_3848_561_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K5_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.616,0.617,0.618,0.619,0.62),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.616, 0.62))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.616,0.617,0.618,0.619,0.62),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.616, 0.62))

K = 6
setwd("~/Documents/Stage_NB/data/merged_data_3848_561_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K6_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.61,0.611,0.612,0.613,0.614,0.615),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.61, 0.615))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.61,0.611,0.612,0.613,0.614,0.615),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.61, 0.615))

561 échantillons - LD pruning = 0.1 (fenêtre de 50 et pas de 10 bp) - 1055 SNPs

K = 5
setwd("~/Documents/Stage_NB/data/merged_data_1055_561_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K5_90_supervised_', i, '.cv.error')
  #merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K5_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.59, 0.591, 0.592, 0.593,0.594,0.595,0.596,0.597,0.598,0.599),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.59, 0.599))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.59, 0.591, 0.592, 0.593,0.594,0.595,0.596,0.597,0.598,0.599),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.59, 0.599))

K = 6
setwd("~/Documents/Stage_NB/data/merged_data_1055_561_supervised")

# Étape 1: Créer une liste vide pour stocker les données
liste_de_donnees <- list()

# Étape 2: Parcourir les fichiers
for (i in 1:30) {
  # Générer le nom du fichier
  merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K6_90_supervised_', i, '.cv.error')
  #merge_cv_error <- paste0('merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_K5_95_supervised_', i, '.cv.error')
  # Lire les données du fichier
  donnees <- read.table(merge_cv_error, header = FALSE)
  # Ajouter les données à la liste
  liste_de_donnees[[i]] <- donnees
}

# Étape 3: Combiner les données en une seule structure 
donnees_combinees <- do.call(rbind, liste_de_donnees)

# Étape 4: Enregistrer le résultat final sans numéro de lignes
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)

merge_cv_error <- read.table("merge_cv_error", header = F)

#box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.586,0.587,0.588,0.589,0.59, 0.591, 0.592, 0.593,0.594,0.595),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.587, 0.595))

#jitter plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
  geom_hline(
    yintercept = c(0.587,0.588,0.589,0.59, 0.591, 0.592, 0.593,0.594,0.595),
    color = "black",
    linetype = "solid",
    size = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  labs(title = "Cross-validation Error Plot",
       x = "K",
       y = "CV") +
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(ylim = c(0.587, 0.595))

Admixture supervisé - Création du fichier liste individu / population

561 échantillons - MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 149 bp) - 3848 SNPs

K = 3
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_maf001_LD03")

Q_3_561 <- read.table("SeqApiPop_561_maf001_LD03_pruned.3.r10.Q", header = FALSE)

colnames(Q_3_561)[colnames(Q_3_561) == "V1"] <- "Vert"
colnames(Q_3_561)[colnames(Q_3_561) == "V2"] <- "Noir"
colnames(Q_3_561)[colnames(Q_3_561) == "V3"] <- "Orange"

categories <- character(nrow(Q_3_561))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_3_561))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_3_561)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_3_561[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_3_561[i,1] > 0.95) {
      categories[i] <- "Vert"
    } else if (Q_3_561[i,2] > 0.95) {
      categories[i] <- "Noir"
    } else if (Q_3_561[i,3] > 0.95) {
      categories[i] <- "Orange"
    }
  }
}

write(categories, file = "output_list_K3_561_95.txt")

### On fait de même pour un seuil de 0.90 cette fois
categories <- character(nrow(Q_3_561))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_3_561))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_3_561)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.9
  if (all(Q_3_561[i,] <= 0.9)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.9
    if (Q_3_561[i,1] > 0.9) {
      categories[i] <- "Vert"
    } else if (Q_3_561[i,2] > 0.9) {
      categories[i] <- "Noir"
    } else if (Q_3_561[i,3] > 0.9) {
      categories[i] <- "Orange"
    }
  }
}

write(categories, file = "output_list_K3_561_90.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

# Les individus du projet BeeMuSe ont leurs fonds génétiques inconnus, on indique cela par '-'
# seuil 0.95
output_list_K3_561_95_merged <- readLines("output_list_K3_561_95_merged.txt")
output_list_K3_561_95_merged <- gsub("Beemuse", "-", output_list_K3_561_95_merged)
writeLines(output_list_K3_561_95_merged, "merged_data_K3_561_95.pop")

# seuil 0.90
output_list_K3_561_90_merged <- readLines("output_list_K3_561_90_merged.txt")
output_list_K3_561_90_merged <- gsub("Beemuse", "-", output_list_K3_561_90_merged)
writeLines(output_list_K3_561_90_merged, "merged_data_K3_561_90.pop")
# Nombre d'individus de référence pour chaque population pure ou de fond génétique inconnu, indiqué dans le fichier .pop lors de l'admixture supervisée
# seuil 0.95
texte_complet <- paste(output_list_K3_561_95_merged, collapse = " ")
K3_95 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K3_95)
print(nombre_apparitions)
## K3_95
##      -   Noir Orange   Vert 
##   1081     68    143     17
# seuil 0.90
texte_complet <- paste(output_list_K3_561_90_merged, collapse = " ")
K3_90 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K3_90)
print(nombre_apparitions)
## K3_90
##      -   Noir Orange   Vert 
##   1033     84    175     17
K = 5
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_maf001_LD03")

labels <- read.csv('~/Documents/Stage_NB/data/SeqApiPop_labels.csv')
samples_561 <- read.table("SeqApiPop_561_maf001_LD03_pruned.fam", header = FALSE)

samples_561 <- samples_561[, 1:2] 
colnames(samples_561)[colnames(samples_561) == "V1"] <- "name"

merged_labels_samples_561 <- merge(labels, samples_561,  by = 'name')

Label_samples_561 <- subset(merged_labels_samples_561, select = "Label") 

writeLines(as.character(Label_samples_561$Label), "Label_samples_561.txt")
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_maf001_LD03")

Q_5_561 <- read.table("SeqApiPop_561_maf001_LD03_pruned.5.r10.Q", header = FALSE)

colnames(Q_5_561)[colnames(Q_5_561) == "V1"] <- "Bleu"
colnames(Q_5_561)[colnames(Q_5_561) == "V2"] <- "Jaune"
colnames(Q_5_561)[colnames(Q_5_561) == "V3"] <- "Vert"
colnames(Q_5_561)[colnames(Q_5_561) == "V4"] <- "Orange"
colnames(Q_5_561)[colnames(Q_5_561) == "V5"] <- "Noir"

categories <- character(nrow(Q_5_561))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_5_561))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_5_561)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_5_561[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_5_561[i,1] > 0.95) {
      categories[i] <- "Bleu"
    } else if (Q_5_561[i,2] > 0.95) {
      categories[i] <- "Jaune"
    } else if (Q_5_561[i,3] > 0.95) {
      categories[i] <- "Vert"
    } else if (Q_5_561[i,4] > 0.95) {
      categories[i] <- "Orange"
    } else if (Q_5_561[i,5] > 0.95) {
      categories[i] <- "Noir"
    }
  }
}

write(categories, file = "output_list_K5_561.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

output_list_K5_561_merged <- readLines("output_list_K5_561_merged.txt")
output_list_K5_561_merged <- gsub("Beemuse", "-", output_list_K5_561_merged)
writeLines(output_list_K5_561_merged, "merged_data_K5_561.pop")
K = 6
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_maf001_LD03")

Q_6_561 <- read.table("SeqApiPop_561_maf001_LD03_pruned.6.r13.Q", header = FALSE)

colnames(Q_6_561)[colnames(Q_6_561) == "V1"] <- "Rouge"
colnames(Q_6_561)[colnames(Q_6_561) == "V2"] <- "Bleu"
colnames(Q_6_561)[colnames(Q_6_561) == "V3"] <- "Orange"
colnames(Q_6_561)[colnames(Q_6_561) == "V4"] <- "Vert"
colnames(Q_6_561)[colnames(Q_6_561) == "V5"] <- "Jaune"
colnames(Q_6_561)[colnames(Q_6_561) == "V6"] <- "Noir"

categories <- character(nrow(Q_6_561))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_6_561))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_6_561)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_6_561[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_6_561[i,1] > 0.95) {
      categories[i] <- "Rouge"
    } else if (Q_6_561[i,2] > 0.95) {
      categories[i] <- "Bleu"
    } else if (Q_6_561[i,3] > 0.95) {
      categories[i] <- "Orange"
    } else if (Q_6_561[i,4] > 0.95) {
      categories[i] <- "Vert"
    } else if (Q_6_561[i,5] > 0.95) {
      categories[i] <- "Jaune"
    } else if (Q_6_561[i,6] > 0.95) {
      categories[i] <- "Noir"
    }
  }
}

write(categories, file = "output_list_K6_561.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

output_list_K6_561_merged <- readLines("output_list_K6_561_merged.txt")
output_list_K6_561_merged <- gsub("Beemuse", "-", output_list_K6_561_merged)
writeLines(output_list_K6_561_merged, "merged_data_K6_561.pop")

561 échantillons - MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

K = 3
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_LD03_default_1055")

Q_3_561_default <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned.3.r0.Q", header = FALSE)

colnames(Q_3_561_default)[colnames(Q_3_561_default) == "V1"] <- "Noir"
colnames(Q_3_561_default)[colnames(Q_3_561_default) == "V2"] <- "Vert"
colnames(Q_3_561_default)[colnames(Q_3_561_default) == "V3"] <- "Orange"

categories <- character(nrow(Q_3_561_default))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_3_561_default))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_3_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_3_561_default[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_3_561_default[i,1] > 0.95) {
      categories[i] <- "Noir"
    } else if (Q_3_561_default[i,2] > 0.95) {
      categories[i] <- "Vert"
    } else if (Q_3_561_default[i,3] > 0.95) {
      categories[i] <- "Orange"
    } 
  }
}

write(categories, file = "output_list_K3_561_LD_default_95.txt")

### On fait de même pour un seuil de 0.90 cette fois
categories <- character(nrow(Q_3_561_default))

# Initialisation du vecteur de catégories
categories <- rep("-", nrow(Q_3_561_default))

# Itérer à travers chaque ligne
for (i in 1:nrow(Q_3_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.9
  if (all(Q_3_561_default[i,] <= 0.9)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.9
    if (Q_3_561_default[i,1] > 0.9) {
      categories[i] <- "Noir"
    } else if (Q_3_561_default[i,2] > 0.9) {
      categories[i] <- "Vert"
    } else if (Q_3_561_default[i,3] > 0.9) {
      categories[i] <- "Orange"
    } 
  }
}

write(categories, file = "output_list_K3_561_LD_default_90.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

# Les individus du projet BeeMuSe ont leurs fonds génétiques inconnus, on indique cela par '-'
# seuil 0.95
output_list_K3_561_default_95_merged <- readLines("output_list_K3_561_LD_default_95_merged.txt")
output_list_K3_561_default_95_merged <- gsub("Beemuse", "-", output_list_K3_561_default_95_merged)
writeLines(output_list_K3_561_default_95_merged, "merged_data_K3_561_LD_default_95.pop")

# seuil 0.90
output_list_K3_561_default_90_merged <- readLines("output_list_K3_561_LD_default_90_merged.txt")
output_list_K3_561_default_90_merged <- gsub("Beemuse", "-", output_list_K3_561_default_90_merged)
writeLines(output_list_K3_561_default_90_merged, "merged_data_K3_561_LD_default_90.pop")
# Nombre d'individus de référence pour chaque population pure ou de fond génétique inconnu, indiqué dans le fichier .pop lors de l'admixture supervisée
# seuil 0.95
texte_complet <- paste(output_list_K3_561_default_95_merged, collapse = " ")
K3_95 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions_95 <- table(K3_95)
print(nombre_apparitions_95)
## K3_95
##      -   Noir Orange   Vert 
##   1176     55     63     15
# seuil 0.90
texte_complet <- paste(output_list_K3_561_default_90_merged, collapse = " ")
K3_90 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions_90 <- table(K3_90)
print(nombre_apparitions_90)
## K3_90
##      -   Noir Orange   Vert 
##   1096     77    119     17
# Définition des valeurs et labels pour le seuil 0.95
valeurs_95 <- c(428, 55, 63, 15)
labels_95 <- c("Unknown", "Black", "Orange", "Green")
df_95 <- data.frame(Reference_Individuals = labels_95, values = valeurs_95)

# Création du graphique (Pie Chart) pour le seuil 0.95
ggplot(df_95, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_label(aes(label = values),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE,
             size = 5.5,             
             fontface = "bold") +  
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#018a16", "#FE9001", "lightgrey")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.95")

# Définition des valeurs et labels pour le seuil 0.90
valeurs_90 <- c(348, 77, 119, 17)
labels_90 <- c("Unknown", "Black", "Orange", "Green")
df_90 <- data.frame(Reference_Individuals = labels_95, values = valeurs_90)

# Création du graphique (Pie Chart) pour le seuil 0.90
ggplot(df_90, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_label(aes(label = values),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE,
             size = 6,             
             fontface = "bold") +  
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#018a16", "#FE9001", "lightgrey")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.90")

K = 5
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_LD03_default_1055")

Q_5_561_default <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned.5.r22.Q", header = FALSE)

colnames(Q_5_561_default)[colnames(Q_5_561_default) == "V1"] <- "Rouge"
colnames(Q_5_561_default)[colnames(Q_5_561_default) == "V2"] <- "Vert"
colnames(Q_5_561_default)[colnames(Q_5_561_default) == "V3"] <- "Noir"
colnames(Q_5_561_default)[colnames(Q_5_561_default) == "V4"] <- "Orange"
colnames(Q_5_561_default)[colnames(Q_5_561_default) == "V5"] <- "Jaune"

categories <- character(nrow(Q_5_561_default))
categories <- rep("-", nrow(Q_5_561_default))

for (i in 1:nrow(Q_5_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_5_561_default[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_5_561_default[i,1] > 0.95) {
      categories[i] <- "Rouge"
    } else if (Q_5_561_default[i,2] > 0.95) {
      categories[i] <- "Vert"
    } else if (Q_5_561_default[i,3] > 0.95) {
      categories[i] <- "Noir"
    } else if (Q_5_561_default[i,4] > 0.95) {
      categories[i] <- "Orange"
    } else if (Q_5_561_default[i,5] > 0.95) {
      categories[i] <- "Jaune"
    } 
  }
}

write(categories, file = "output_list_K5_561_LD_default_95.txt")

# De même pour un seuil de 0.90
categories <- character(nrow(Q_5_561_default))
categories <- rep("-", nrow(Q_5_561_default))

for (i in 1:nrow(Q_5_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.9
  if (all(Q_5_561_default[i,] <= 0.9)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.9
    if (Q_5_561_default[i,1] > 0.9) {
      categories[i] <- "Rouge"
    } else if (Q_5_561_default[i,2] > 0.9) {
      categories[i] <- "Vert"
    } else if (Q_5_561_default[i,3] > 0.9) {
      categories[i] <- "Noir"
    } else if (Q_5_561_default[i,4] > 0.9) {
      categories[i] <- "Orange"
    } else if (Q_5_561_default[i,5] > 0.9) {
      categories[i] <- "Jaune"
    } 
  }
}

write(categories, file = "output_list_K5_561_LD_default_90.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

# Les individus du projet BeeMuSe ont leurs fonds génétiques inconnus, on indique cela par '-'
# seuil 0.95
output_list_K5_561_default_95_merged <- readLines("output_list_K5_561_LD_default_95_merged.txt")
output_list_K5_561_default_95_merged <- gsub("Beemuse", "-", output_list_K5_561_default_95_merged)
writeLines(output_list_K5_561_default_95_merged, "merged_data_K5_561_LD_default_95.pop")

# seuil 0.90
output_list_K5_561_default_90_merged <- readLines("output_list_K5_561_LD_default_90_merged.txt")
output_list_K5_561_default_90_merged <- gsub("Beemuse", "-", output_list_K5_561_default_90_merged)
writeLines(output_list_K5_561_default_90_merged, "merged_data_K5_561_LD_default_90.pop")
# Nombre d'individus de référence pour chaque population pure ou de fond génétique inconnu, indiqué dans le fichier .pop lors de l'admixture supervisée
# seuil 0.95
texte_complet <- paste(output_list_K5_561_default_95_merged, collapse = " ")
K5_95 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K5_95)
print(nombre_apparitions)
## K5_95
##      -  Jaune   Noir Orange  Rouge   Vert 
##   1178     21     49     24     22     15
# seuil 0.90
texte_complet <- paste(output_list_K5_561_default_90_merged, collapse = " ")
K5_90 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K5_90)
print(nombre_apparitions)
## K5_90
##      -  Jaune   Noir Orange  Rouge   Vert 
##   1121     23     74     47     27     17
# Définition des valeurs et labels pour le seuil 0.95
valeurs_95 <- c(430, 21, 49, 24, 22, 15)
labels_95 <- c("Unknown", "Yellow", "Black", "Orange", "Red", "Green")
df_95 <- data.frame(Reference_Individuals = labels_95, values = valeurs_95)

# Création du graphique (Pie Chart) pour le seuil 0.95
ggplot(df_95, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_label(aes(label = values),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE,
             size = 5.5,             
             fontface = "bold") +  
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#018a16", "#FE9001", "#FE0101", "lightgrey", "#EDFE01")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.95")

# Définition des valeurs et labels pour le seuil 0.90
valeurs_90 <- c(373, 23, 74, 47, 27, 17)
labels_90 <- c("Unknown", "Yellow", "Black", "Orange", "Red", "Green")
df_90 <- data.frame(Reference_Individuals = labels_90, values = valeurs_90)

# Création du graphique (Pie Chart) pour le seuil 0.90
ggplot(df_90, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_label(aes(label = values),
             color = "white",
             position = position_stack(vjust = 0.5),
             show.legend = FALSE,
             size = 6,             
             fontface = "bold") +  
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#018a16", "#FE9001", "#FE0101", "lightgrey", "#EDFE01")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.90")

K = 6
setwd("~/Documents/Stage_NB/data/Qfiles/SeqApiPop_561_LD03_default_1055")

Q_6_561_default <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned.6.r23.Q", header = FALSE)

colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V1"] <- "Bleu"
colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V2"] <- "Rouge"
colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V3"] <- "Noir"
colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V4"] <- "Jaune"
colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V5"] <- "Orange"
colnames(Q_6_561_default)[colnames(Q_6_561_default) == "V6"] <- "Vert"

categories <- character(nrow(Q_6_561_default))
categories <- rep("-", nrow(Q_6_561_default))

for (i in 1:nrow(Q_6_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.95
  if (all(Q_6_561_default[i,] <= 0.95)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.95
    if (Q_6_561_default[i,1] > 0.95) {
      categories[i] <- "Bleu"
    } else if (Q_6_561_default[i,2] > 0.95) {
      categories[i] <- "Rouge"
    } else if (Q_6_561_default[i,3] > 0.95) {
      categories[i] <- "Noir"
    } else if (Q_6_561_default[i,4] > 0.95) {
      categories[i] <- "Jaune"
    } else if (Q_6_561_default[i,5] > 0.95) {
      categories[i] <- "Orange"
    } else if (Q_6_561_default[i,6] > 0.95) {
      categories[i] <- "Vert"
    }
  }
}

write(categories, file = "output_list_K6_561_LD_default_95.txt")

# De même pour un seuil de 0.90
categories <- character(nrow(Q_6_561_default))
categories <- rep("-", nrow(Q_6_561_default))

for (i in 1:nrow(Q_6_561_default)) {
  # Vérifier si aucune valeur dans la ligne ne dépasse 0.9
  if (all(Q_6_561_default[i,] <= 0.9)) {
    categories[i] <- "-"
  } else {
    # Vérifier quelle colonne a la valeur supérieure à 0.9
    if (Q_6_561_default[i,1] > 0.9) {
      categories[i] <- "Bleu"
    } else if (Q_6_561_default[i,2] > 0.9) {
      categories[i] <- "Rouge"
    } else if (Q_6_561_default[i,3] > 0.9) {
      categories[i] <- "Noir"
    } else if (Q_6_561_default[i,4] > 0.9) {
      categories[i] <- "Jaune"
    } else if (Q_6_561_default[i,5] > 0.9) {
      categories[i] <- "Orange"
    } else if (Q_6_561_default[i,6] > 0.9) {
      categories[i] <- "Vert"
    }
  }
}

write(categories, file = "output_list_K6_561_LD_default_90.txt")
setwd("~/Documents/Stage_NB/data/Qfiles")

# Les individus du projet BeeMuSe ont leurs fonds génétiques inconnus, on indique cela par '-'
# seuil 0.95
output_list_K6_561_default_95_merged <- readLines("output_list_K6_561_LD_default_95_merged.txt")
output_list_K6_561_default_95_merged <- gsub("Beemuse", "-", output_list_K6_561_default_95_merged)
writeLines(output_list_K6_561_default_95_merged, "merged_data_K6_561_LD_default_95.pop")

# seuil 0.90
output_list_K6_561_default_90_merged <- readLines("output_list_K6_561_LD_default_90_merged.txt")
output_list_K6_561_default_90_merged <- gsub("Beemuse", "-", output_list_K6_561_default_90_merged)
writeLines(output_list_K6_561_default_90_merged, "merged_data_K6_561_LD_default_90.pop")
# Nombre d'individus de référence pour chaque population pure ou de fond génétique inconnu, indiqué dans le fichier .pop lors de l'admixture supervisée
# seuil 0.95
texte_complet <- paste(output_list_K6_561_default_95_merged, collapse = " ")
K6_95 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K6_95)
print(nombre_apparitions)
## K6_95
##      -   Bleu  Jaune   Noir Orange  Rouge   Vert 
##   1211     12     18     11     20     22     15
# seuil 0.90
texte_complet <- paste(output_list_K6_561_default_90_merged, collapse = " ")
K6_90 <- unlist(strsplit(texte_complet, "\\s+"))
nombre_apparitions <- table(K6_90)
print(nombre_apparitions)
## K6_90
##      -   Bleu  Jaune   Noir Orange  Rouge   Vert 
##   1166     14     21     24     42     26     16
# Définition des valeurs et labels pour le seuil 0.95
valeurs_95 <- c(463, 12, 18, 11, 20, 22, 15)
labels_95 <- c("Unknown", "Blue", "Yellow", "Black", "Orange", "Red", "Green")
df_95 <- data.frame(Reference_Individuals = labels_95, values = valeurs_95)

# Création du graphique (Pie Chart) pour le seuil 0.95
ggplot(df_95, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_text(aes(label = values),  color = "white", size = 6,  fontface = "bold",
            position = position_stack(vjust = 0.5)) + 
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#0603a6", "#018a16", "#FE9001", "#FE0101", "lightgrey", "#EDFE01")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.95")

# Définition des valeurs et labels pour le seuil 0.90
valeurs_90 <- c(418, 14, 21, 24, 42, 26, 16)
labels_90 <- c("Unknown", "Blue", "Yellow", "Black", "Orange", "Red", "Green")
df_90 <- data.frame(Reference_Individuals = labels_90, values = valeurs_90)

# Création du graphique (Pie Chart) pour le seuil 0.90
ggplot(df_90, aes(x = "", y = values, fill = Reference_Individuals)) +
  geom_col(width = 1, color = "white") +  
  geom_text(aes(label = values),  color = "white", size = 6,  fontface = "bold",
            position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("black", "#0603a6", "#018a16", "#FE9001", "#FE0101", "lightgrey", "#EDFE01")) + 
  theme_void() +
  ggtitle("Pie Chart - Seuil 0.95")

MM_31-20 - Admixture supervisée - K = 3

setwd("~/Documents/Stage_NB/data/Qfiles/merged_data_1055_K3_supervised_561_95")

# Lecture des données depuis le fichier
Q_3_561_default <- read.table("merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD_default.3.r1.Q", header = FALSE)

# Renommer les colonnes
colnames(Q_3_561_default) <- c("Orange", "Noir", "Vert")

# Créer un dataframe des données correspondants aux individus de famille MM_31-20
data <- data.frame(
  Row = 1:34,
  Vert = c(0.137779, 0.106600, 0.144376, 0.121816, 0.115205, 0.156852, 0.116906, 0.128898, 0.108226, 0.127597,
           0.125377, 0.143687, 0.139994, 0.118624, 0.105885, 0.114958, 0.125339, 0.111632, 0.139381, 0.097184,
           0.103432, 0.127272, 0.146901, 0.258441, 0.131292, 0.121175, 0.153283, 0.121445, 0.106801, 0.148918,
           0.110793, 0.116428, 0.268355, 0.143695),
  Noir = c(0.000010, 0.208670, 0.000010, 0.000010, 0.000010, 0.357705, 0.000010, 0.179942, 0.000010, 0.000010,
           0.000010, 0.000010, 0.005702, 0.201438, 0.209293, 0.198569, 0.183688, 0.181689, 0.000010, 0.189135,
           0.000010, 0.000010, 0.000010, 0.010149, 0.000010, 0.000010, 0.000010, 0.000010, 0.000010, 0.000010,
           0.005139, 0.004824, 0.000010, 0.000010),
  Orange = c(0.862211, 0.684730, 0.855614, 0.878174, 0.884785, 0.485443, 0.883084, 0.691160, 0.891764, 0.872393,
             0.874613, 0.856303, 0.854304, 0.679938, 0.684822, 0.686473, 0.690974, 0.706679, 0.860609, 0.713681,
             0.896558, 0.872718, 0.853089, 0.731410, 0.868698, 0.878815, 0.846707, 0.878545, 0.893189, 0.851072,
             0.884068, 0.878748, 0.731635, 0.856295),
  Name = c("19_E5.CEL", "20_G5.CEL", "35_E10.CEL", "37_I10.CEL", "38_K10.CEL", "39_M10.CEL", "40_O10.CEL", "41_A12.CEL",
        "42_C12.CEL", "49_A14.CEL", "53_I14.CEL", "54_K14.CEL", "56_O14.CEL", "4_H2.CEL", "5_J2.CEL", "6_L2.CEL", "7_N2.CEL",
        "8_P2.CEL", "9_B4.CEL", "16_P4.CEL", "18_D6.CEL", "19_F6.CEL", "23_N6.CEL", "24_P6.CEL", "26_D8.CEL", "28_H8.CEL",
        "29_J8.CEL", "378_H22.CEL", "379_J22.CEL", "380_L22.CEL", "381_N22.CEL", "382_P22.CEL", "384_D24.CEL", "385_F24.CEL")

)

# Convertir les données en un format long
data_long <- pivot_longer(data, cols = c(Orange, Noir, Vert), names_to = "Color", values_to = "Proportion")

# Créer un graphique à barres empilées
ggplot(data_long, aes(x = Name, y = Proportion, fill = Color)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Fonds génétiques - MM_31-20 (ID_2a) - BeeMuSe - K = 3 (seuil 0.95) - 1055 SNPs", y = "Proportion", x = "Individu") +
  scale_fill_manual(values = c(Vert = "#018a16", Noir = "black", Orange = "#FE9001")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  coord_flip()

Légende - CV plot error - Admixture

SeqApiPop 629 échantillons - MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)

valeurs <- c(0.475215, 0.4747, 0.47582, 0.4794, 0.4888, 0.5012)

bp <- barplot(valeurs, horiz = TRUE, xlim = c(0.47, 0.51), width = 0.1,
        xlab = "CV", las = 1, col = "darkblue", border = NA)

abline(v = seq(0.47, 0.51, by = 0.005), col = "lightgray", lty = 2)

SeqApiPop 629 échantillons - SNPsBeeMuSe filtered - 10030 SNPS

valeurs <- c(0.7218, 0.7219, 0.7242, 0.7282, 0.7405, 0.7955)

bp <- barplot(valeurs, horiz = TRUE, xlim = c(0.72, 0.8), width = 0.1,
        xlab = "CV", las = 1, col = "darkblue", border = NA)

abline(v = seq(0.72, 0.8, by = 0.01), col = "lightgray", lty = 2)

SeqApiPop 629 échantillons - SNPsBeeMuSe filtered - MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

valeurs <- c(0.7593, 0.7595, 0.7669, 0.7815, 0.8179)

bp <- barplot(valeurs, horiz = TRUE, xlim = c(0.75, 0.82), width = 0.1,
        xlab = "CV", las = 1, col = "darkblue", border = NA)

abline(v = seq(0.75, 0.82, by = 0.01), col = "lightgray", lty = 2)

SeqApiPop 561 échantillons - SNPsBeeMuSe filtered - MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPS

valeurs <- c(0.7734 ,0.7709, 0.7680, 0.7715, 0.7871, 0.8271)

bp <- barplot(valeurs, horiz = TRUE, xlim = c(0.76, 0.83), width = 0.1,
        xlab = "CV", las = 1, col = "darkblue", border = NA)

abline(v = seq(0.76, 0.83, by = 0.01), col = "lightgray", lty = 2)